#PACKAGES
packages <- c("sp.scRNAseq", "sp.scRNAseqData", "ggthemes", "tidyverse", "seqTools")
purrr::walk(packages, library, character.only = TRUE)
rm(packages)

Goal

Identify DE genes between colon and SI stem cells via single cell RNA-seq data from mouse. This was perfored using 2 different strategies: a) pairwise comparison of all cell types as defined by unsupervised classification and b) comparison of the top 50 Lgr5+ colon vs small intestine cells from the stem cell population (as defined above).

Run unsupervised classification

First we run unsupervised classificaiton using t-SNE and the Mclust software.

#setup spCounts
s <- str_detect(colnames(countsMgfp), "^s")
countsMgfpMeta <- countsMgfpMeta %>%
mutate(mouseName = if_else(
  str_detect(plate, "NJA004"),
  "C57black/6J", "C57black/6J/Lgr5gfp"
  ))
 m <- colnames(countsMgfp) %in% filter(countsMgfpMeta, mouseName == "C57black/6J")$sample

cObjSng <- spCounts(countsMgfp[, s & m], cbind(countsMgfpERCC[, s & m]))
cObjMul <- spCounts(countsMgfp[, !s & m], cbind(countsMgfpERCC[, !s & m]))

#spUnsupervised
load("~/Github/sp.scRNAseqTesting/inst/differentialExpLgr5ColonSI/uObj.rda")

Plot unsupervised classification results.

plotUnsupervisedClass(uObj, cObjSng)

Show marker expression specific for the cell types in the tissue.

plotUnsupervisedMarkers(
  uObj, cObjSng,
  c("Lgr5", "Muc2", "Ptprc", "Chga", "Alpi", "Lyz1", "Dclk1", "Slc40a1"),
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

Show the tissues.

plotUnsupervisedClass(uObj, cObjSng) %>%
plotData() %>%
inner_join(countsMgfpMeta, by = c("Sample" = "sample")) %>%
ggplot() +
geom_point(aes(`t-SNE dim 1`, `t-SNE dim 2`, colour = tissue)) +
theme_few() +
theme(legend.position = "top") +
guides(colour = guide_legend(title = "Tissue"))

Rename the classes

classifications <- tibble(
  sample = rownames(getData(uObj, "tsne")),
  oldClass = getData(uObj, "classification")
) %>%
mutate(newClass = case_when(
  oldClass == "C1" ~ "C.Stem",
  oldClass == "F1" ~ "C.Colonocyte",
  oldClass == "B1" ~ "C.Goblet",
  oldClass == "G1" ~ "Blood.Tufft.Endocrine",
  oldClass == "D1" ~ "SI.Stem",
  oldClass == "A1" ~ "SI.Enterocyte",
  oldClass == "E1" ~ "SI.Goblet",
  TRUE ~ "error"
))

classification(uObj) <- classifications$newClass[match(rownames(getData(uObj, "tsne")), classifications$sample)]
groupMeans(uObj) <- averageGroupExpression(cObjSng, getData(uObj, "classification"), FALSE)
tsneMeans(uObj) <- tsneGroupMeans(getData(uObj, "tsne"), getData(uObj, "classification"))
plotUnsupervisedClass(uObj, cObjSng)

Show Lgr5 expression.

plotUnsupervisedMarkers(uObj, cObjSng, "Lgr5")

Expression is log2(cpm) and then scaled to the interval [0, 1] with: (exp - min(exp)) / (max(exp) - min(exp)).

Show Lgr5 expression per class and tissue.

plotUnsupervisedMarkers(uObj, cObjSng, "Lgr5") %>%
plotData() %>%
inner_join(countsMgfpMeta, by = c("Sample" = "sample")) %>%
ggplot() +
geom_histogram(
  aes(Lgr5, fill = tissue, y = ..count.., group = tissue), binwidth = 0.05, position = position_dodge(),
  alpha = 0.7
) +
facet_wrap(~Classification, scales = "free_y") +
theme_bw() +
theme(legend.position = "top", plot.caption = element_text(hjust = 0)) +
guides(fill = guide_legend(title = "Tissue")) +
labs(
  x = "Lgr5 expression: log2(cpm) scaled to the interval [0, 1] with: (exp - min(exp)) / (max(exp) - min(exp))."
)

Differential expression strategy 1

Pairwise KStest using all classes.

ks.res <- KStest(getData(cObjSng, "counts.cpm"), getData(uObj, "classification"), cores = 8)
p.ks.res <- processKStest(ks.res, getData(uObj, "classification"), 0.05) %>%
filter(sigBool) %>%
group_by(gene) %>%
filter(n() == 1) %>%
ungroup() %>%
mutate(p.prod = map_dbl(p.values, prod)) %>%
mutate(meanCpm = map2_dbl(id, gene, function(i, g) {
  mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") == i])
})) %>%
mutate(log2.fc = map2_dbl(id, gene, function(i, g) {
  log2(
    mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") == i]) /
    mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") != i])
  )
})) %>%
arrange(desc(log2.fc), p.prod, desc(statSum), desc(meanCpm))

save(p.ks.res, file = './differentialExpressionStrategy1.rda', compress = "bzip2")
openxlsx::write.xlsx(p.ks.res, file = "./differentialExpressionStrategy1.xlsx")

Number of significant genes detected at alpha 0.05?

p.ks.res %>% count(id)
## # A tibble: 7 x 2
##   id                        n
##   <chr>                 <int>
## 1 Blood.Tufft.Endocrine   374
## 2 C.Colonocyte            133
## 3 C.Goblet                 90
## 4 C.Stem                   38
## 5 SI.Enterocyte           213
## 6 SI.Goblet                70
## 7 SI.Stem                 158

Number of significant genes detected at alpha 0.01?

processKStest(ks.res, getData(uObj, "classification"), 0.01) %>%
filter(sigBool) %>%
group_by(gene) %>%
filter(n() == 1) %>%
ungroup() %>%
mutate(p.prod = map_dbl(p.values, prod)) %>%
count(id)
## # A tibble: 7 x 2
##   id                        n
##   <chr>                 <int>
## 1 Blood.Tufft.Endocrine   240
## 2 C.Colonocyte             95
## 3 C.Goblet                 56
## 4 C.Stem                   25
## 5 SI.Enterocyte           126
## 6 SI.Goblet                39
## 7 SI.Stem                  92

Number of significant genes detected at alpha 0.001?

processKStest(ks.res, getData(uObj, "classification"), 0.001) %>%
filter(sigBool) %>%
group_by(gene) %>%
filter(n() == 1) %>%
ungroup() %>%
mutate(p.prod = map_dbl(p.values, prod)) %>%
count(id)
## # A tibble: 7 x 2
##   id                        n
##   <chr>                 <int>
## 1 Blood.Tufft.Endocrine   149
## 2 C.Colonocyte             59
## 3 C.Goblet                 31
## 4 C.Stem                   22
## 5 SI.Enterocyte            74
## 6 SI.Goblet                21
## 7 SI.Stem                  67

Number of significant genes detected at alpha 0.05 and fold change > 2?

p.ks.res %>%
filter(log2.fc > 1 | log2.fc < -1) %>%
count(id)
## # A tibble: 7 x 2
##   id                        n
##   <chr>                 <int>
## 1 Blood.Tufft.Endocrine   347
## 2 C.Colonocyte             79
## 3 C.Goblet                 77
## 4 C.Stem                   17
## 5 SI.Enterocyte           126
## 6 SI.Goblet                60
## 7 SI.Stem                  66

Plot some candidates in the context of all cells.

#dotplot function
dotplot <- function(data) {
  data %>%
  rename(Gene = gene) %>%
  ggplot() +
  geom_dotplot(
    data = . %>% filter(expression != 0),
    aes(Classification, expression, colour = Gene, fill = Gene, group = interaction(Classification, Gene)),
    binaxis="y", stackdir="center", binwidth = 0.01, position = position_dodge()
  ) +
  geom_text(
    data = . %>%
      group_by(Classification, Gene) %>%
      summarize(do = paste0(round(length(which(expression == 0)) / n(), digits = 2) * 100, "%")),
    aes(Classification, 0.05, label = do), size = 2
  ) +
  geom_point(
    data = . %>%
      group_by(Gene, Classification) %>%
      summarize(median = median(expression)) %>%
      filter(median != 0),
    aes(Classification, median), shape = 95, size = 3
  ) +
  ylim(0, 1) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top") +
  facet_wrap(~Gene) +
  labs(
    x = "Tissue",
    y = "log2(expression)[0, 1]"
  ) +
  guides(fill = FALSE, colour = FALSE)
}

SI stem genes

#plot SI candidates
p.ks.res %>%
filter(id == "SI.Stem") %>%
arrange(desc(log2.fc), p.prod, desc(statSum), desc(meanCpm)) %>%
slice(1:30) %>%
pull(gene) %>%
plotUnsupervisedMarkers(uObj, cObjSng, .) %>%
plotData() %>%
gather(gene, expression, -(Sample:Colour)) %>%
dotplot()

Colon stem genes

p.ks.res %>%
filter(id == "C.Stem") %>%
arrange(desc(log2.fc), p.prod, desc(statSum), desc(meanCpm)) %>%
slice(1:30) %>%
pull(gene) %>%
plotUnsupervisedMarkers(uObj, cObjSng, .) %>%
plotData() %>%
gather(gene, expression, -(Sample:Colour)) %>%
dotplot()

Results were saved to the file “differentialExpressionStrategy1” with the following columns:

Id: the cell type that the corresponding gene has been found to be differentially expressed in. Gene: the name of the gene sigBool: a logical indicating if all pairwise tests were significant (alpha = 0.001) statSum: the sum of test statistics from the KS tests p.values: the individual p values from each pairwise test that was performed for the corresponding gene Statistics: the individual test statistics from each pairwise test that was performed for the corresponding gene p.prod: the product of the p.values from all pairwise tests meanCpm: the mean counts per million expression values for the gene in the cell type listed in the “id” column Log2.fc: the log 2 fold change comparing the cell type listed in the “id” column vs all other cell types as a group.

Differential expression strategy 2

KStest using only the top 50 cells in each of the C.Stem and S.Stem classes with highest Lgr5 and comparing only these classes.

idx <- which(getData(uObj, "classification") %in% c("C.Stem", "SI.Stem"))
counts <- getData(cObjSng, "counts.cpm")[, idx]
classes <- getData(uObj, "classification")[idx]

#find top 50 cells in each class with highest Lgr5
top50 <- tibble(Lgr5 = counts["Lgr5", ], class = classes, sample = colnames(counts)) %>%
group_by(class) %>%
top_n(50, Lgr5) %>%
ungroup()

ks.res2 <- KStest(counts[, match(top50$sample, colnames(counts))], top50$class, cores = 8)
p.ks.res2 <- ks.res2 %>%
filter(p.value < 0.001) %>%
separate(combination, c("Celltype1", "Celltype2"), sep = "-")  %>%
mutate(
  meanCpmCelltype1 = map2_dbl(Celltype1, gene, function(i, g) {
    mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") == i])
  }),
  meanCpmCelltype2 = map2_dbl(Celltype2, gene, function(i, g) {
    mean(getData(cObjSng, "counts.cpm")[g, getData(uObj, "classification") == i])
  })
) %>%
mutate(log2.fc = log2(meanCpmCelltype1 / meanCpmCelltype2)) %>%
arrange(desc(log2.fc), p.value)

save(p.ks.res2, file = '~/Github/sp.scRNAseq/inst/differentialExpLgr5ColonSI/differentialExpressionStrategy2.rda', compress = "bzip2")
openxlsx::write.xlsx(p.ks.res2, file = "~/Github/sp.scRNAseq/inst/differentialExpLgr5ColonSI/differentialExpressionStrategy2.xlsx")

Number of significant genes detected at alpha 0.05?

ks.res2 %>%
filter(p.value < 0.05) %>%
count(combination)
## # A tibble: 1 x 2
##   combination        n
##   <chr>          <int>
## 1 C.Stem-SI.Stem  1160

Number of significant genes detected at alpha 0.01?

ks.res2 %>%
filter(p.value < 0.01) %>%
count(combination)
## # A tibble: 1 x 2
##   combination        n
##   <chr>          <int>
## 1 C.Stem-SI.Stem   536

Number of significant genes detected at alpha 0.001?

ks.res2 %>%
filter(p.value < 0.001) %>%
count(combination)
## # A tibble: 1 x 2
##   combination        n
##   <chr>          <int>
## 1 C.Stem-SI.Stem   274

Number of significant genes detected at alpha 0.05 and fold change > 2?

p.ks.res2 %>%
filter(log2.fc > 1 | log2.fc < -1) %>%
unite(combination, Celltype1, Celltype2) %>%
count(combination)
## # A tibble: 1 x 2
##   combination        n
##   <chr>          <int>
## 1 C.Stem_SI.Stem   195

Plot some candidates in the context of all cells.

SI stem genes

p.ks.res2 %>%
filter(log2.fc < 1) %>%
arrange(log2.fc, p.value) %>%
slice(1:30) %>%
pull(gene) %>%
plotUnsupervisedMarkers(uObj, cObjSng, .) %>%
plotData() %>%
gather(gene, expression, -(Sample:Colour)) %>%
dotplot()

Colon stem genes

p.ks.res2 %>%
filter(log2.fc > 1) %>%
arrange(desc(log2.fc), p.value) %>%
slice(1:30) %>%
pull(gene) %>%
plotUnsupervisedMarkers(uObj, cObjSng, .) %>%
plotData() %>%
gather(gene, expression, -(Sample:Colour)) %>%
dotplot()

Results were saved to the file “differentialExpressionStrategy2” with the following columns:

Cell type 1: 1st cell type in comparison Cell type 2: 2nd cell type in comparison Gene: the name of the gene Stat: the test statistic from the KS test that was performed for the corresponding gene p.value: the p values from the KS test that was performed for the corresponding gene meanCpmCellType1: mean counts per million for cell type 1 meanCpmCellType2: mean counts per million for cell type 2 Log2.fc: the log 2 fold change comparing the two cell types.

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2          seqTools_0.0.0.9000    
##  [3] forcats_0.3.0           stringr_1.3.1          
##  [5] dplyr_0.7.5             purrr_0.2.5            
##  [7] readr_1.1.1             tidyr_0.8.1            
##  [9] tibble_1.4.2            ggplot2_2.2.1          
## [11] tidyverse_1.2.1         ggthemes_3.5.0         
## [13] sp.scRNAseqData_0.0.1.0 sp.scRNAseq_0.0.2.1    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-137        matrixStats_0.53.1  lubridate_1.7.4    
##  [4] RColorBrewer_1.1-2  httr_1.3.1          rprojroot_1.3-2    
##  [7] tools_3.5.0         backports_1.1.2     utf8_1.1.4         
## [10] R6_2.2.2            lazyeval_0.2.1      BiocGenerics_0.26.0
## [13] colorspace_1.3-2    tidyselect_0.2.4    gridExtra_2.3      
## [16] mnormt_1.5-5        compiler_3.5.0      cli_1.0.0          
## [19] rvest_0.3.2         xml2_1.2.0          labeling_0.3       
## [22] scales_0.5.0        psych_1.8.4         digest_0.6.15      
## [25] foreign_0.8-70      rmarkdown_1.9       pkgconfig_2.0.1    
## [28] htmltools_0.3.6     rlang_0.2.1         readxl_1.1.0       
## [31] rstudioapi_0.7      bindr_0.1.1         jsonlite_1.5       
## [34] mclust_5.4          zip_1.0.0           magrittr_1.5       
## [37] Rcpp_0.12.17        munsell_0.4.3       S4Vectors_0.18.2   
## [40] viridis_0.5.1       stringi_1.2.2       yaml_2.1.19        
## [43] ggraph_1.0.1        MASS_7.3-50         Rtsne_0.13         
## [46] plyr_1.8.4          grid_3.5.0          parallel_3.5.0     
## [49] ggrepel_0.8.0       crayon_1.3.4        udunits2_0.13      
## [52] lattice_0.20-35     haven_1.1.1         hms_0.4.2          
## [55] knitr_1.20          pillar_1.2.3        igraph_1.2.1       
## [58] pso_1.0.3           reshape2_1.4.3      stats4_3.5.0       
## [61] glue_1.2.0          evaluate_0.10.1     modelr_0.1.2       
## [64] tweenr_0.1.5        cellranger_1.1.0    gtable_0.2.0       
## [67] assertthat_0.2.0    ggforce_0.1.2       openxlsx_4.1.0     
## [70] broom_0.4.4         tidygraph_1.1.0     e1071_1.6-8        
## [73] class_7.3-14        viridisLite_0.3.0   units_0.5-1